As a part of my visualization project work, I started to explore on suicide rates globally and the chance of it’s relation with the happiness score(Data set is availble in the kaggle itself and has been attached to the data section for reference). I worked as a data engineer with honeywell for two years, and currently am pursuing my masters in Data Analytics. Please find my research work here
This project started with the paradox of “Suicide in happy places”. The economists ‘explanation for this paradox was that people tended to compare themselves to those around them—if you’re an unhappy person in a happy place, your negative feelings might be exacerbated by your positive surroundings, which could lead to suicide’ The other key aspects of this report is to use high level packages available in R to build visualizations of suicide rates globally and explaining the thought process went behind in generating a suitable visualization.
#Data Cleaning
data_s<- read.csv("/Users/vineeth/Desktop/master.csv")
data_h<- read.csv("/Users/vineeth/Desktop/Happiness.csv")
data_s<-data_s[-c(8,9)] # Drropping columns which has missing values almost everywhere.
data_s<-data_s %>%
rename(
Country = country,
Year = year,
Sex= sex,
Age = age,
Pop = population,
S_N = suicides_no,
GDP = gdp_for_year....,
GDP_C = gdp_per_capita....,
Gen = generation
)
#Reordering age as aa Ordinal variable:
data_s$Age <- factor(data_s$Age,
levels = c("5-14 years",
"15-24 years",
"25-34 years",
"35-54 years",
"55-74 years",
"75+ years"))
#We can see from data that 2016 column is mostly NA so filtering the same
data_s <- data_s %>%
filter(Year != 2016)
# 3) TIDYING DATAFRAME
data_s$Age <- gsub(" years", "", data_s$Age)
data_s$Sex <- ifelse(data_s$Sex == "male", "Male", "Female")
# Adding continent to data:
data_s$Continent <- countrycode(sourcevar = data_s[, "Country"],
origin = "country.name",
destination = "continent") Insights
# Create tibble for sex so we can use it when creating our line plot.
Gender_filter<- data_s %>%
select(Year, Sex, S_N, Pop) %>%
group_by(Year, Sex) %>%
summarise(S_C = round((sum(S_N)/sum(Pop))*100000, 2))
Gender_filter$Sex<- as.factor(Gender_filter$Sex)
ggplot(Gender_filter, aes(x = Year, y = S_C)) +
geom_line(aes(color = Sex)) +
scale_color_manual(values = c("darkred", "steelblue"))+
geom_hline(yintercept = worldwide_suc_avg, linetype = 2, color = "antiquewhite4", size = 1) +
labs(title = "Global Suicides (per 100k)",
subtitle = "Trend over time, 1985 - 2015.",
x = "Year",
y = "Suicides per 100k") +
scale_x_continuous(breaks = seq(1985, 2015, 2)) +
scale_y_continuous(breaks = seq(10, 20)) +
transition_reveal(as.numeric(Year))Age_time <- data_s %>%
group_by(Year, Age) %>%
summarize(suicide_per_100k = (sum(as.numeric(S_N)) / sum(as.numeric(Pop))) * 100000)
#continent_time$Continent <- factor(continent_time$Continent, ordered = T, levels = data_s$Continent)
Age_time_plot <- ggplot(Age_time, aes(x = Year, y = suicide_per_100k, col = factor(Age))) +
geom_line() +
geom_point() +
labs(title = "Age wise Sucide rate trend by year",
x = "Year",
y = "Suicides per 100k",
color = "Age") +
theme(legend.position = "none", title = element_text(size = 10)) +
scale_x_continuous(breaks = seq(1985, 2015,4), minor_breaks = F) +
transition_reveal(as.numeric(Year))
Age_time_plot+theme_dark() Insights
Global_data <- data_s %>%
select(Year, S_N, Pop,Continent,GDP_C) %>%
group_by(Year,Continent) %>%
summarise(S_C = round((sum(S_N)/sum(Pop))*100000, 2), Pop = mean(Pop), GDP_C = mean(GDP_C))
Global_data$Continent <- as.factor(Global_data$Continent)
p <- ggplot(
Global_data,
aes(x = Global_data$GDP_C, y=Global_data$S_C, size = Pop, col = factor(Continent)
)) +
geom_point() +
scale_color_viridis_d() +
scale_size(range = c(2, 12)) +
scale_x_log10() +
labs(x = "GDP per capita", y = "Sucide per capita")
p + transition_time(Global_data$Year) +
labs(title = "Year: {frame_time}",
size = "Population",
color = "Continent") + shadow_wake(0.5)Insights
continent_time <- data_s %>%
group_by(Year, Continent) %>%
summarize(suicide_per_100k = (sum(as.numeric(S_N)) / sum(as.numeric(Pop))) * 100000)
#continent_time$Continent <- factor(continent_time$Continent, ordered = T, levels = data_s$Continent)
continent_time_plot <- ggplot(continent_time, aes(x = Year, y = suicide_per_100k, col = factor(Continent))) +
geom_line() +
geom_point() +
labs(title = "Continental Sucide rate trend by year",
x = "Year",
y = "Suicides per 100k",
color = "Continent") +
theme(legend.position = "none", title = element_text(size = 10)) +
scale_x_continuous(breaks = seq(1985, 2015,4), minor_breaks = F) +
transition_reveal(as.numeric(Year))
continent_time_plot + theme_dark()country_time <- data_s %>%
group_by(Year, Country) %>%
summarize(suicide_per_100k = (sum(as.numeric(S_N)) / sum(as.numeric(Pop))) * 100000)
p3 <- ggplot(country_time,
aes(x = Country,
y = suicide_per_100k)) +
theme(legend.position="top",
axis.text=element_text(size = 6),
axis.text.x = element_text(angle = 90))
p4 <- p3 + geom_point(aes(color = Year),
alpha = 0.5,
size = 1.5,
position = position_jitter(width = 0.25, height = 0))
p4 +
scale_color_continuous(name="",
breaks = c(1985, 1995, 2005, 2015),
labels = c("'85", "'95", "'05","'15"),
low = muted("blue"), high = muted("red")) +transition_reveal(as.numeric(Year)) + shadow_trail()